This file is almost a duplicate of SimulationStudySummary.Rmd but aimed at comparing simulation study results of Tau and 1/2Tau cases.
Show the PSJS estimated results for Simulated datasets with estimated tau
realized.tract.dist <- function(L, p){
x <- 1:(L-1)
dist.1 <- (2 + (L-1-x)*p)/(1.0/p + L -1.)*(1-p)^(x-1)
dist.L <- 1/p/(1.0/p + L -1.)*(1-p)^(L-1)
dist <- c(dist.1, dist.L)
mean.L <- sum(x * dist.1) + L*dist.L
var.L <- sum(x^2 * dist.1) + L^2*dist.L - mean.L^2
return(list(dist = dist, mean = mean.L, sd = sqrt(var.L)))
}
Tract.list <- c(3.0, 10.0, 50.0, 100.0, 300.0)
for(tract in Tract.list){
target_summary <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
col.names <- target_summary["tract_length", ] < 10*tract
sim_info <- get(paste("sim.tract.", toString(tract), sep = ""))
hist(target_summary["tract_length", col.names], breaks = 50,
main = paste("PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
#abline(v = realized.tract.dist(492, 1.0/tract)$mean, col = "blue")
abline(v = tract, col = 2)
abline(v = mean(sim_info["mean subtract length", ]), col = "green")
HalfTau.target_summary <- get(paste("HalfTau_PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
HalfTau.col.names <- HalfTau.target_summary["tract_length", ] < 10*tract
HalfTau.sim_info <- get(paste("HalfTau.sim.tract.", toString(tract), sep = ""))
hist(HalfTau.target_summary["tract_length", HalfTau.col.names], breaks = 50,
main = paste("Half Tau PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
abline(v = tract, col = 2)
TenthTau.target_summary <- get(paste("TenthTau_PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
TenthTau.col.names <- TenthTau.target_summary["tract_length", ] < 10*tract
TenthTau.sim_info <- get(paste("TenthTau.sim.tract.", toString(tract), sep = ""))
hist(TenthTau.target_summary["tract_length", TenthTau.col.names], breaks = 50,
main = paste("Tenth Tau PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
abline(v = tract, col = 2)
print(paste("Tract = ", toString(tract), ".0 combined PSJS HKY Results", sep = ""))
print("Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282")
matrix.print <- matrix(c(sum(TenthTau.col.names), sum(HalfTau.col.names), sum(col.names),
mean(TenthTau.target_summary["tract_length", TenthTau.col.names]), mean(HalfTau.target_summary["tract_length", HalfTau.col.names]), mean(target_summary["tract_length", col.names]),
sd(TenthTau.target_summary["tract_length", TenthTau.col.names]), sd(HalfTau.target_summary["tract_length", HalfTau.col.names]), sd(target_summary["tract_length", col.names]),
# mean(HalfTau.sim_info["mean subtract length", HalfTau.col.names]), mean(sim_info["mean subtract length", col.names]),
# sd(HalfTau.sim_info["mean subtract length", HalfTau.col.names]), sd(sim_info["mean subtract length", col.names])),
mean(TenthTau.target_summary["tract_length", TenthTau.col.names] * TenthTau.target_summary["init_rate", TenthTau.col.names]), mean(HalfTau.target_summary["tract_length", HalfTau.col.names] * HalfTau.target_summary["init_rate", HalfTau.col.names]), mean(target_summary["tract_length", col.names] * target_summary["init_rate", col.names]),
sd(TenthTau.target_summary["tract_length", TenthTau.col.names] * TenthTau.target_summary["init_rate", TenthTau.col.names]), sd(HalfTau.target_summary["tract_length", HalfTau.col.names] * HalfTau.target_summary["init_rate", HalfTau.col.names]), sd(target_summary["tract_length", col.names] * target_summary["init_rate", col.names]),
mean(TenthTau.target_summary["r2", TenthTau.col.names]), mean(HalfTau.target_summary["r2", HalfTau.col.names]), mean(target_summary["r2", col.names]),
sd(TenthTau.target_summary["r2", TenthTau.col.names]), sd(HalfTau.target_summary["r2", HalfTau.col.names]), sd(target_summary["r2", col.names]),
mean(TenthTau.target_summary["r3", TenthTau.col.names]), mean(HalfTau.target_summary["r3", HalfTau.col.names]), mean(target_summary["r3", col.names]),
sd(TenthTau.target_summary["r3", TenthTau.col.names]), sd(HalfTau.target_summary["r3", HalfTau.col.names]), sd(target_summary["r3", col.names])
),
3, 9)
colnames(matrix.print) <- c("Total samples","mean tract length","sd tract length","mean Tau","sd Tau",
"mean r2", "sd r2", "mean r3", "sd r3")
rownames(matrix.print) <- c("TenthTau", "HalfTau", "Tau")
print("Now showing summary of estimates")
print(matrix.print)
}
## [1] "Tract = 3.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
## Total samples mean tract length sd tract length mean Tau
## TenthTau 91 3.283349 3.312316 2.356514
## HalfTau 100 2.922520 1.113955 12.200990
## Tau 100 2.793908 1.038238 25.032230
## sd Tau mean r2 sd r2 mean r3 sd r3
## TenthTau 1.656687 0.6051389 0.2798352 12.39054 3.558901
## HalfTau 4.434941 0.5855221 0.2439452 12.77058 3.795465
## Tau 8.287523 0.5879993 0.2835385 13.20915 4.000814
## [1] "Tract = 10.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
## Total samples mean tract length sd tract length mean Tau
## TenthTau 98 11.537135 14.047343 2.505714
## HalfTau 100 10.000468 4.353661 12.146068
## Tau 99 8.624418 3.405958 24.368881
## sd Tau mean r2 sd r2 mean r3 sd r3
## TenthTau 1.964019 0.5838685 0.2805556 12.38993 3.309168
## HalfTau 4.777245 0.5890073 0.2211978 12.26681 3.183922
## Tau 7.349466 0.5496428 0.2215865 12.09983 2.812596
## [1] "Tract = 50.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
## Total samples mean tract length sd tract length mean Tau
## TenthTau 98 87.35219 152.55136 2.633865
## HalfTau 98 34.97746 17.35117 12.700596
## Tau 99 37.33633 18.09137 25.787994
## sd Tau mean r2 sd r2 mean r3 sd r3
## TenthTau 2.644350 0.6531514 0.2840591 12.77153 3.485084
## HalfTau 7.863899 0.5979565 0.2700079 12.82955 3.631304
## Tau 14.612228 0.5890735 0.2289175 12.32934 3.922138
## [1] "Tract = 100.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
## Total samples mean tract length sd tract length mean Tau
## TenthTau 99 114.38781 175.91309 3.606949
## HalfTau 98 59.38336 56.16456 13.671732
## Tau 98 63.16095 30.73807 27.120792
## sd Tau mean r2 sd r2 mean r3 sd r3
## TenthTau 5.114314 0.5972691 0.2476132 12.86846 3.618311
## HalfTau 9.658842 0.5570442 0.2107733 12.23641 3.095682
## Tau 22.528654 0.6138259 0.3042493 12.81388 4.645047
## [1] "Tract = 300.0 combined PSJS HKY Results"
## [1] "Real value used in simulation: Tau = 22.581687801812059, r2 = 0.53919187294821969, r3 = 11.580030840998282"
## [1] "Now showing summary of estimates"
## Total samples mean tract length sd tract length mean Tau
## TenthTau 99 180.6429 209.6309 2.927074
## HalfTau 99 167.4497 215.7783 14.947028
## Tau 97 180.2817 291.7673 30.643917
## sd Tau mean r2 sd r2 mean r3 sd r3
## TenthTau 5.246772 0.5541338 0.2031755 12.35401 2.981998
## HalfTau 12.247934 0.5722834 0.2303773 12.26209 3.317833
## Tau 20.181477 0.6089437 0.2580605 12.39773 3.244111
Now see if the bias can be corrected by analyzing each half of the alignment Correction uses n choose 2 as sample size
Tract.list <- c(50.0, 100.0, 300.0)
n1 <- choose(489, 2)
n2 <- choose(243, 2)
n3 <- choose(246, 2)
for(tract in Tract.list){
first.half.summary <- get(paste("first_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
second.half.summary <- get(paste("second_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
target.summary <- get(paste("PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
col.names <- intersect(intersect(colnames(first.half.summary), colnames(second.half.summary)), colnames(target.summary))
first.half.corrected <- (n1 - n2) /(n1 * 1./target.summary["tract_length", col.names] - n2 * 1.0/first.half.summary["tract_length", col.names])
second.half.corrected <- (n1 - n3) / (n1 * 1./target.summary["tract_length", col.names] - n3 * 1./second.half.summary["tract_length", col.names])
# first.half.corrected[first.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[first.half.corrected<target.summary["tract_length", col.names]]]
# second.half.corrected[second.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[second.half.corrected<target.summary["tract_length", col.names]]]
#
hist(target.summary["tract_length", col.names], breaks = 50,
main = paste("PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist(first.half.corrected, breaks = 50,
main = paste("First half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist(second.half.corrected, breaks = 50,
main = paste("Second half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist((first.half.corrected + second.half.corrected) / 2, breaks = 50,
main = paste("Both half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
plot(target.summary["tract_length", col.names], (first.half.corrected + second.half.corrected) / 2,
xlab = "PSJS estimated", ylab = "Both half corrected",
main = paste("Both half corrected vs PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
abline(a = 0, b = 1)
abline(h = tract)
n.list <- c(rep(n1, length(col.names)), rep(n2, length(col.names)), rep(n3, length(col.names)))
ni.theta <- c(n1*log(target.summary["tract_length", col.names]), n2*log(first.half.summary["tract_length", col.names]), n3*log(second.half.summary["tract_length", col.names]))
theta.list <- c(log(target.summary["tract_length", col.names]), log(first.half.summary["tract_length", col.names]), log(second.half.summary["tract_length", col.names]))
plot(n.list, ni.theta,
main = paste("-ni*log(p) vs ni, Tract = ", toString(tract), ".0 ", sep = ""))
# OK, this violates the constant variance assumptiong of linear regression
fit <- lm(ni.theta ~ n.list)
y <- ni.theta / sqrt(n.list)
x1 <- sqrt(n.list)
x2 <- 1/sqrt(n.list)
fit2 <- lm(y~0+x1+x2)
abline(a = fit$coefficients[1], b = fit$coefficients[2], col = "red")
plot(1/n.list, theta.list,
main = paste("-log(p) vs 1/ni, Tract = ", toString(tract), ".0 ", sep = ""))
x3 <- 1/n.list
fit3 <- lm(theta.list~x3)
abline(a=fit3$coefficients[1], b=fit3$coefficients[2], col="red")
print(fit$coefficients)
print(c(exp(fit$coefficients[2]), exp(fit2$coefficients[1]), exp(fit3$coefficients[1])))
print(c(mean(target.summary["tract_length", col.names])/exp(fit$coefficients[1]/n1),
mean(target.summary["tract_length", col.names])/exp(fit2$coefficients[2]/n1),
mean(target.summary["tract_length", col.names])/exp(fit3$coefficients[2]/n1)))
print.matrix <- matrix(c(mean(target.summary["tract_length", col.names]), sd(target.summary["tract_length", col.names]),
mean(first.half.corrected), sd(first.half.corrected),
mean(second.half.corrected), sd(second.half.corrected),
mean((first.half.corrected + second.half.corrected) / 2),
sd(first.half.corrected + second.half.corrected) / 2),
4, 2, byrow = TRUE)
colnames(print.matrix) <- c("mean", "sd")
rownames(print.matrix) <- c("PSJS estimated", "Corrected using first half",
"Corrected using second half", "Corrected using both")
print(print.matrix)
}
## (Intercept) n.list
## -9019.770188 3.574033
## n.list x1 (Intercept)
## 35.66014 35.68844 35.77358
## (Intercept) x2 x3
## 39.23174 39.24729 39.27842
## mean sd
## PSJS estimated 36.37531 16.56685
## Corrected using first half 27.47277 58.83801
## Corrected using second half 44.38371 32.16274
## Corrected using both 35.92824 34.14067
## (Intercept) n.list
## -9373.163328 4.116731
## n.list x1 (Intercept)
## 61.35833 61.32944 61.24278
## (Intercept) x2 x3
## 68.68128 68.66512 68.63282
## mean sd
## PSJS estimated 63.49233 31.42831
## Corrected using first half 42.72633 211.68845
## Corrected using second half 45.93008 194.32619
## Corrected using both 44.32820 144.13719
## (Intercept) n.list
## 2681.103380 4.692135
## n.list x1 (Intercept)
## 109.0859 109.0523 108.9514
## (Intercept) x2 x3
## 246.2427 246.2047 246.1290
## mean sd
## PSJS estimated 251.8385 746.9723
## Corrected using first half 604.9041 3271.9091
## Corrected using second half -220.5940 2831.3043
## Corrected using both 192.1551 1885.2528
Now see if the bias can be corrected by analyzing each half of the alignment Correction uses n as sample size
Tract.list <- c(50.0, 100.0, 300.0)
n1 <- 489
n2 <- 243
n3 <- 246
for(tract in Tract.list){
first.half.summary <- get(paste("first_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
second.half.summary <- get(paste("second_half_PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
target.summary <- get(paste("PSJS_HKY_Tract_", tract, "_combined_summary", sep = ""))
col.names <- intersect(intersect(colnames(first.half.summary), colnames(second.half.summary)), colnames(target.summary))
first.half.corrected <- (n1 - n2) /(n1 * 1./target.summary["tract_length", col.names] - n2 * 1.0/first.half.summary["tract_length", col.names])
second.half.corrected <- (n1 - n3) / (n1 * 1./target.summary["tract_length", col.names] - n3 * 1./second.half.summary["tract_length", col.names])
# first.half.corrected[first.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[first.half.corrected<target.summary["tract_length", col.names]]]
# second.half.corrected[second.half.corrected<target.summary["tract_length", col.names]] <- target.summary["tract_length",col.names[second.half.corrected<target.summary["tract_length", col.names]]]
hist(target.summary["tract_length", col.names], breaks = 50,
main = paste("PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist(first.half.corrected, breaks = 50,
main = paste("First half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist(second.half.corrected, breaks = 50,
main = paste("Second half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
hist((first.half.corrected + second.half.corrected) / 2, breaks = 50,
main = paste("Both half corrected PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
plot(target.summary["tract_length", col.names], (first.half.corrected + second.half.corrected) / 2,
xlab = "PSJS estimated", ylab = "Both half corrected",
main = paste("Both half corrected vs PSJS Estimated Tract length, Tract = ", toString(tract), ".0 ", sep = ""))
abline(a = 0, b = 1)
abline(h = tract)
n.list <- c(rep(n1, length(col.names)), rep(n2, length(col.names)), rep(n3, length(col.names)))
ni.theta <- c(n1*log(target.summary["tract_length", col.names]), n2*log(first.half.summary["tract_length", col.names]), n3*log(second.half.summary["tract_length", col.names]))
theta.list <- c(log(target.summary["tract_length", col.names]), log(first.half.summary["tract_length", col.names]), log(second.half.summary["tract_length", col.names]))
plot(n.list, ni.theta,
main = paste("-ni*log(p) vs ni, Tract = ", toString(tract), ".0 ", sep = ""))
# OK, this violates the constant variance assumptiong of linear regression
fit <- lm(ni.theta ~ n.list)
y <- ni.theta / sqrt(n.list)
x1 <- sqrt(n.list)
x2 <- 1/sqrt(n.list)
fit2 <- lm(y~0+x1+x2)
abline(a = fit$coefficients[1], b = fit$coefficients[2], col = "red")
plot(1/n.list, theta.list,
main = paste("-log(p) vs 1/ni, Tract = ", toString(tract), ".0 ", sep = ""))
x3 <- 1/n.list
fit3 <- lm(theta.list~x3)
abline(a=fit3$coefficients[1], b=fit3$coefficients[2], col="red")
print(fit$coefficients)
print(c(exp(fit$coefficients[2]), exp(fit2$coefficients[1]), exp(fit3$coefficients[1])))
print(c(mean(target.summary["tract_length", col.names])/exp(fit$coefficients[1]/n1),
mean(target.summary["tract_length", col.names])/exp(fit2$coefficients[2]/n1),
mean(target.summary["tract_length", col.names])/exp(fit3$coefficients[2]/n1)))
print.matrix <- matrix(c(mean(target.summary["tract_length", col.names]), sd(target.summary["tract_length", col.names]),
mean(first.half.corrected), sd(first.half.corrected),
mean(second.half.corrected), sd(second.half.corrected),
mean((first.half.corrected + second.half.corrected) / 2),
sd(first.half.corrected + second.half.corrected) / 2),
4, 2, byrow = TRUE)
colnames(print.matrix) <- c("mean", "sd")
rownames(print.matrix) <- c("PSJS estimated", "Corrected using first half",
"Corrected using second half", "Corrected using both")
print(print.matrix)
}
## (Intercept) n.list
## -111.603087 3.726936
## n.list x1 (Intercept)
## 41.55159 41.60182 41.68563
## (Intercept) x2 x3
## 45.70088 45.73770 45.79297
## mean sd
## PSJS estimated 36.375314 16.56685
## Corrected using first half 6.445536 190.26919
## Corrected using second half 25.431103 266.56870
## Corrected using both 15.938320 161.22476
## (Intercept) n.list
## -115.339877 4.273891
## n.list x1 (Intercept)
## 71.80047 71.75102 71.66871
## (Intercept) x2 x3
## 80.38182 80.34491 80.28960
## mean sd
## PSJS estimated 63.49233 31.42831
## Corrected using first half 62.39591 675.89787
## Corrected using second half 47.71725 190.24342
## Corrected using both 55.05658 349.87868
## (Intercept) n.list
## 33.209189 4.646589
## n.list x1 (Intercept)
## 104.2288 104.1801 104.0990
## (Intercept) x2 x3
## 235.3034 235.2300 235.1201
## mean sd
## PSJS estimated 251.8385122 746.9723
## Corrected using first half -0.6239804 915.4536
## Corrected using second half 50.2867915 637.8758
## Corrected using both 24.8314055 572.5871